library(readr)
rent <- read_csv("student_predictions.csv",
col_types = cols(
`Posted On` = col_date(format = "%m/%d/%Y"),
Size = col_integer(),
Unit.Floor = col_integer(),
Total.Floors = col_integer()
)
)
colnames(rent)
## [1] "Posted On" "BHK" "Size"
## [4] "Unit.Floor" "Total.Floors" "Area.Type"
## [7] "City" "Area Locality" "Furnishing.Status"
## [10] "Tenant.Preferred" "Bathroom" "Point.of.Contact"
## [13] "Rent" "Marie" "Lucas"
## [16] "Grace" "Bethany" "Jenna"
## [19] "Emma" "Natalie" "Hannah"
## [22] "Chris" "Jacob"
predictions <- rent[, -(1:13)]
head(predictions)
## # A tibble: 6 × 10
## Marie Lucas Grace Bethany Jenna Emma Natalie Hannah Chris Jacob
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5667. -857. 4100 9095. 5818. 5339. 8881. 5254. 6463 5258.
## 2 8232. 11922. 8300 9215. 8604. 8231. 8881. 8065. 8782 8230.
## 3 34172. 49947. 37000 15422. 44414. 29004. 28625. 36119. 31304 33567.
## 4 5463. 5221. 6700 8688. 6917. 6197. 8881. 6088. 7340 6240.
## 5 15285. 44453. 14500 17856. 15088. 14840. 15597. 14900. 15266 14797.
## 6 7238. 10185. 7700 9388. 8189. 7896. 8881. 7732. 8469 7838.
summary(predictions)
## Marie Lucas Grace Bethany
## Min. : 3380 Min. : -1378 Min. : 2700 Min. : 0
## 1st Qu.: 11167 1st Qu.: 22961 1st Qu.: 12100 1st Qu.: 11280
## Median : 16141 Median : 41226 Median : 15550 Median : 17152
## Mean : 32248 Mean : 47279 Mean : 31518 Mean : 31105
## 3rd Qu.: 32527 3rd Qu.: 60849 3rd Qu.: 31450 3rd Qu.: 26773
## Max. :581209 Max. :189988 Max. :468700 Max. :1278290
## Jenna Emma Natalie Hannah
## Min. : 5226 Min. : 4159 Min. : 8881 Min. : 0
## 1st Qu.: 11613 1st Qu.: 11291 1st Qu.: 13090 1st Qu.: 11511
## Median : 15921 Median : 15477 Median : 15598 Median : 15765
## Mean : 32503 Mean : 32277 Mean : 36880 Mean : 31634
## 3rd Qu.: 32623 3rd Qu.: 32165 3rd Qu.: 31692 3rd Qu.: 32527
## Max. :796643 Max. :776983 Max. :609500 Max. :660605
## Chris Jacob
## Min. : 6393 Min. : 4133
## 1st Qu.: 11235 1st Qu.: 11366
## Median : 15266 Median : 15515
## Mean : 34211 Mean : 32170
## 3rd Qu.: 31049 3rd Qu.: 32236
## Max. :1879887 Max. :818594
Missing value imputation
library(dplyr)
median_rent <- median(rent$Rent)
predictions <- predictions %>%
mutate(
Lucas = ifelse(
Lucas < 0,
median_rent,
Lucas
)
) %>%
mutate(
Bethany = ifelse(
Bethany < 1,
median_rent,
Bethany
)
) %>%
mutate(
Hannah = ifelse(
Hannah < 1,
median_rent,
Hannah
)
)
summary(predictions)
## Marie Lucas Grace Bethany
## Min. : 3380 Min. : 11.24 Min. : 2700 Min. : 7853
## 1st Qu.: 11167 1st Qu.: 22960.78 1st Qu.: 12100 1st Qu.: 11531
## Median : 16141 Median : 41225.89 Median : 15550 Median : 17152
## Mean : 32248 Mean : 47417.04 Mean : 31518 Mean : 31238
## 3rd Qu.: 32527 3rd Qu.: 60849.42 3rd Qu.: 31450 3rd Qu.: 26773
## Max. :581209 Max. :189987.58 Max. :468700 Max. :1278290
## Jenna Emma Natalie Hannah
## Min. : 5226 Min. : 4159 Min. : 8881 Min. : 4460
## 1st Qu.: 11613 1st Qu.: 11291 1st Qu.: 13090 1st Qu.: 11565
## Median : 15921 Median : 15477 Median : 15598 Median : 15818
## Mean : 32503 Mean : 32277 Mean : 36880 Mean : 31678
## 3rd Qu.: 32623 3rd Qu.: 32165 3rd Qu.: 31692 3rd Qu.: 32527
## Max. :796643 Max. :776983 Max. :609500 Max. :660605
## Chris Jacob
## Min. : 6393 Min. : 4133
## 1st Qu.: 11235 1st Qu.: 11366
## Median : 15266 Median : 15515
## Mean : 34211 Mean : 32170
## 3rd Qu.: 31049 3rd Qu.: 32236
## Max. :1879887 Max. :818594
Calculating RMSE
library(Metrics)
actual_rent_log <- log10(rent$Rent)
predictions <- log10(as.matrix(predictions))
RMSEs <- apply(
predictions,
MARGIN = 2,
FUN = function(x) {
Metrics::rmse(actual_rent_log, x)
}
)
RMSE <- round(sort(RMSEs), 4)
df <- as.data.frame(RMSE)
df$name <- names(RMSE)
# preserve the order in plot
df$name <- factor(df$name, levels = rev(df$name))
library(ggplot2)
ggplot(df, aes(x = name, y = RMSE)) +
geom_segment(
aes(x = name, xend = name, y = 0, yend = RMSE),
color = "gray",
lwd = 2
) +
geom_point(
size = 4,
pch = 21,
bg = "red",
col = "red"
) +
geom_text(
aes(label = RMSE),
color = "blue",
size = 3,
nudge_y = .02
) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y = element_blank()
)+
coord_flip()

Marie’s model
library(mgcv)
library(nlme)
gen.am <- gam(
logRent ~ s(logSize,k=11) + City + Furnishing.Status + Bathroom
+ BHK + Area.Type + Point.of.Contact +Total.Floors,
data = rent.practice
)
gen.am
#R-sq.(adj) = 0.821
Hannah’s model
fit2 <- lm(logrent ~ BHK + Size + City + `Furnishing Status`
+ Bathroom +`Point of Contact`,
data = rent2
)
summary(fit2)
#Adjusted R-squared: 0.8099
Emma’s model
model <- lm(Log.Rent ~ City + BHK + Size + Tenant.Preferred +
Furnishing.Status + Point.of.Contact + Num.Floors,
data = rent5)
#Adjusted R-squared: 0.8119
Jacob’s model
lm <- lm(rent.log$Rent ~ rent.log[, 2] + rent.log[, 3] +
rent.log[, 7] + rent.log[, 8] + rent.log[, 11])
#Size + City + BHK + Contact + Furnishing Status ???
# Adjusted R-squred: 0.8056
Ensemble approach (many models vote)
predictions_subset <- subset(
predictions,
select = c(Marie, Hannah, Emma, Jacob)
)
ensembl_prediction <- apply(
predictions_subset,
MARGIN = 1,
FUN = mean
)
Metrics::rmse(actual_rent_log, ensembl_prediction)
## [1] 0.1638336
library(corrplot)
df <- cbind(predictions, rent = actual_rent_log)
corrplot.mixed(cor(df), order = "AOE")

dist_pearson <- function(x, ...)
as.dist(1-cor(t(x), method="pearson"))
plot(
hclust(
dist_pearson(t(df)),
method = "average"
),
xlab = NULL
)

library(reshape2)
wide_data = cbind(t(df), names = colnames(df))
long_data <- melt(wide_data, id.vars = c("names") )
colnames(long_data)[1] = "name"
head(long_data)
## name Var2 value
## 1 Marie 3.75332455600047
## 2 Lucas 4.21748394421391
## 3 Grace 3.61278385671974
## 4 Bethany 3.95881311230356
## 5 Jenna 3.76477910405509
## 6 Emma 3.72742909811498
long_data$value <- as.numeric(long_data$value)
ggplot(long_data, aes(x=name, y=value, color = name, fill = name)) +
geom_violin(trim=FALSE, show.legend = FALSE)

ggplot(long_data, aes(x=name, y=value, color = name, fill = name)) +
geom_violin(trim=FALSE, show.legend = FALSE) +
ylim(3.5, 6)

Factors
| Size |
O |
O |
O |
O |
O |
O |
| City |
O |
O |
O |
O |
O |
O |
| Contact |
O |
O |
O |
O |
O |
O |
| BHK |
O |
O |
O |
O |
O |
O |
| Bathroom |
O |
O |
- |
O |
O |
O |
| Furnishing |
O |
O |
O |
- |
O |
O |
| Total.Floors |
O |
- |
O |
- |
- |
O |
| Area.Type |
O |
- |
- |
- |
- |
- |
| Tenant |
- |
- |
O |
- |
- |
- |
data_all <- cbind(
rent[, 1:12],
Rent = log10(rent$Rent),
predictions
)
#write.csv(data_all, "model_performance.csv", row.names = FALSE)
ggplot(data_all,
aes(
x = Rent,
y = Marie,
color = City
)) +
geom_point() +
xlim(3.4, 5.7) +
ylim(3.4, 5.7) +
xlab("Actual Rent (log10)") +
geom_abline(intercept = 0, slope = 1, size = 0.5)

library(plotly)
p <- ggplot(data_all,
aes(
x = Rent,
y = Marie,
color = City
)) +
geom_point() +
xlim(3.4, 5.7) +
ylim(3.4, 5.7) +
xlab("Actual Rent (log10)") +
geom_abline(intercept = 0, slope = 1, size = 0.5)
ggplotly(p)